Das Ziel ist es, aus dem Datacamp Datensatz Soccer Data, welcher Daten aus der höchsten englischen Fussballdivision beinhaltet, folgende Fragestellung / Hypothese zu beantworten:

Die Mannschaft, die zur Halbzeit vorne liegt, gewinnt mit einer Chance von mindestens 75% das Spiel. Falls es zur Halbzeit unentschieden steht, gewinnt eher das Heimteam mit einer Chance von mindestens 33.4%.

Als Einführung werden wir auf Datacamp folgende Kurse durchgehen:

# Import libraries
library("plotly")
library("plyr")
library("dplyr")
library("forcats")
library("RColorBrewer")

Daten einlesen und Dataframe erstellen

# List files in folder "data"
files <- list.files(path="./data/", pattern=NULL, all.files=FALSE, full.names=TRUE)

# Create Dataframe with all csv from years 2015-2019
df <- ldply(.data = files, .fun = read.csv)

View(df)

Hier zählen wir, wie oft das Heim- und das Auswärtsteam zur Halb- und Vollzeit gewinnen und wie oft das Spiel unentschieden steht.

# Create dataframe for halftime & fulltime results and count frequency 
df_htr <- df %>% count(HTR)
df_ftr <- df %>% count(FTR)

# Halftime
df_htr
# Fulltime
df_ftr
# Create dataframe with halftime & fulltime result frequency
df_results <- data.frame(c("Away win", "Draw", "Home win"), c(df_htr$n), c(df_ftr$n))

# Rename column headers
col_headings <- c('Result','Halftime','Fulltime')
names(df_results) <- col_headings

df_results
# Plot grouped bar chart to visualize halftime & fulltime results
fig <- plot_ly(
  df_results, x = ~Result, y = ~Halftime, type = 'bar', name = 'Halftime Score') %>% 
  add_trace(y = ~Fulltime, name = 'Fulltime Score') %>%
  layout(yaxis = list(title = 'Amount'), 
         barmode = 'group',
         width = 600, height = 500)
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
fig
# Merge HTR & FTR to new column 'result'
df$result <- paste(df$HTR, df$FTR)

# Example: H H = home team is winning at halftime and also wins the game at fulltime

df[,"result", drop=FALSE]
# Plot all different game progresses and their amount
df_count_results <- df %>%
  count(result)
  
df_count_results %>%
  mutate(result = fct_reorder(result, n, .desc = TRUE)) %>%
  plot_ly(x = ~result, y = ~n, text = ~n, textposition = 'auto') %>%
  add_bars() %>%
  layout(xaxis = list(title = "Game Progress"),
         yaxis = list(title = "Amount of Game Progresses"),
         title = "How are the different game progresses distributed?",
         width = 800, height = 500)
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()

Hier wollen wir herausfinden, wie wahrscheinlich die 9 möglichen Spielverläufe sind, bevor das Spiel überhaupt beginnt.

# Group by game outcome & calculate probability of all outcomes
df_count_results_prob <- df %>% 
  group_by(result) %>% 
  summarise(count_result = round(n() / nrow(df) * 100, digits = 2))

# Plot all different game progresses and their probability
df_count_results_prob %>%
  mutate(result = fct_reorder(result, count_result, .desc = TRUE)) %>%
  plot_ly(x = ~result, y = ~count_result, text = ~count_result, textposition = 'auto') %>%
  add_bars() %>%
  layout(xaxis = list(title = "Game Progress"),
         yaxis = list(title = "Probability of Game Progress (%)"),
         title = "How are the different game progresses distributed?",
         width = 800, height = 500)
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
# Group by game outcome & calculate probability of all outcomes
df_count_results <- df %>% 
  group_by(result) %>% 
  summarise(count_result = round(n() / nrow(df) * 100, digits = 2))

df_count_results %>%
  plot_ly(labels = ~result, values = ~count_result) %>%
  add_pie(hole = 0.4, color = I("white")) %>%
  layout(xaxis = list(title = "Game Progress"),
         yaxis = list(title = "Probability %"),
         title = "What is the probability of each game progress?")
# Percentage of rows in `df2` relative to the rows in `df1`; used for the
# halftime -> fulltime away / draw / home probabilities.
#
# df1: data frame whose row count is the denominator (e.g. all games with a
#      given halftime result)
# df2: data frame whose row count is the numerator (usually a subset of df1)
# Returns the percentage rounded to two decimals, or NA_real_ when `df1` has
# no rows (the plain division silently produced NaN or Inf in that case).
calc_prob <- function(df1, df2) {
  total <- nrow(df1)
  if (isTRUE(total == 0)) {
    return(NA_real_)
  }
  # Same operation order as before (100 / total * part), so results for
  # non-empty input are unchanged
  round(100 / total * nrow(df2), digits = 2)
}
# Filter home teams winning at halftime
df_ht_home <- df %>% 
  filter(HTR == "H")

# Filter home teams winning at halftime & fulltime
df_ft_home <- df_ht_home %>% 
  filter(FTR == "H")

home_win_prob <- calc_prob(df_ht_home, df_ft_home)

cat("Probability that the home team wins the game if they are leading at half time: ", home_win_prob, "%")
Probability that the home team wins the game if they are leading at half time:  82.55 %
# Filter away teams winning at halftime
df_ht_away <- df %>% 
  filter(HTR == "A")

# Filter away teams winning at halftime & fulltime
df_ft_away <- df_ht_away %>% 
  filter(FTR == "A")

away_win_prob <- calc_prob(df_ht_away, df_ft_away)

cat("Probability that the away team wins the game if they are leading at half time: ", away_win_prob, "%")
Probability that the away team wins the game if they are leading at half time:  72.03 %
# Filter draw at halftime
df_ht_draw <- df %>% 
  filter(HTR == "D")

# Filter draw at halftime & fulltime
df_ft_draw <- df_ht_draw %>% 
  filter(FTR == "D")

draw_prob <- calc_prob(df_ht_draw, df_ft_draw)

cat("Probability that the game ends in a draw if the halftime result is also a draw: ", draw_prob, "%")
Probability that the game ends in a draw if the halftime result is also a draw:  36.45 %
# Filter draw at halftime & the home team winning at fulltime
df_ht_draw_ft_home_win <- df_ht_draw %>%
  filter(FTR == "H")

home_win_after_ht_draw_prob <- calc_prob(df_ht_draw, df_ht_draw_ft_home_win)

cat("Probability that the home team wins if the halftime result is a draw: ", home_win_after_ht_draw_prob, "%")
Probability that the home team wins if the halftime result is a draw:  38.03 %

Bestätigung der Hypothese

Somit können wir aus den 2 Wahrscheinlichkeiten “home_win_prob” und “away_win_prob” unsere Hypothese wie folgt bestätigen:

# Probability that the team winning at half time wins the game
ht_ft_win_prob <- round(((home_win_prob * nrow(df_ft_home)) + (away_win_prob * nrow(df_ft_away))) / (nrow(df_ft_home) + nrow(df_ft_away)), digits = 2)

cat("Probability that the team leading at half time wins the entire game: ", ht_ft_win_prob, "%")
Probability that the team leading at half time wins the entire game:  78.41 %
fig <- plot_ly(
  y = c("Home wins after leading at HT", "Away wins after leading at HT", "Draw at FT & HT"), 
  x = c(home_win_prob, away_win_prob, draw_prob),
  type = "bar"
)

fig <- fig %>% layout(title = "Game Progress Overview",
         xaxis = list(title = "Probability"))

fig
NA

Fragestellung 2:

Das Heimteam schiesst mehr aufs Tor als das Auswärtsteam, aber welches Team ist effizienter?

# Plot Fulltime Home Shots vs Home Goals 
p1 <- df %>%
  plot_ly(x = ~FTHG, y = ~HS, coloraxis = 'coloraxis') %>%
  add_histogram2d(nbinsy = 40)

# Plot Fulltime Away Shots vs Away Goals 
p2 <- df %>%
  plot_ly(x = ~FTAG, y = ~AS, coloraxis = 'coloraxis') %>%
  add_histogram2d(nbinsy = 40)

# Add both plots together to build subplot
subplot(p1, p2, nrows = 1, shareX = FALSE, shareY = FALSE) %>%
  layout(
    title = "Goals vs Shots Overview",
    xaxis = list(title = "Home Goals"),
    xaxis2 = list(title = "Away Goals"),
    yaxis = list(title = "Home Shots"),
    yaxis2 = list(title = "Away Shots"),
    coloraxis=list(colorscale='Jet')
  )

Anhand des Plots oben ist gut zu sehen, dass das Heimteam eher 1-2 Tore schiesst und 9-16 Torschüsse aufweist. Beim Auswärtsteam sieht es etwas anders aus: Es schiesst eher 0-1 Tore und weist 6-11 Torschüsse auf.

# Fit the regression model of Fulltime Away Goals on Away Shots
m <- lm(FTAG ~ AS, data = df)

# Create the scatterplot with smoother
df %>%
  plot_ly(x = ~AS, y = ~FTAG) %>%
  add_markers(showlegend = FALSE) %>%
  add_lines(y = ~fitted(m))
df %>%
  plot_ly(x = ~HS, y = ~AS, coloraxis = 'coloraxis') %>%
  add_histogram2d(nbinsx = 70, nbinsy = 60) %>%
  layout(coloraxis=list(colorscale='Jet'))
df_efficiency <- df %>%
  summarise(
    "Home Goals per Shot" = round(sum(FTHG) / sum(HS),digits = 3),
    "Home Goals per Shot on Target" = round(sum(FTHG) / sum(HST),digits = 3),
    "Away Goals per Shot" = round(sum(FTAG) / sum(AS),digits = 3),
    "Away Goals per Shot on Target" = round(sum(FTAG) / sum(AST),digits = 3)
)
# Transpose dataframe
t_df_efficiency <- data.frame("Percent" = t(df_efficiency))
fig <- plot_ly(
  y = c("Home Goals per Shot", "Home Goals per Shot on Target", "Away Goals per Shot", "Away Goals per Shot on Target"),
  x = t_df_efficiency$Percent,
  type = "bar"
)

fig <- fig %>% layout(title = "Team Efficiency",
         xaxis = list(title = "Probability"))

fig
---
title: "Data Visualization mit Plotly"
output: html_notebook
---

Das Ziel ist es, aus dem Datacamp Datensatz [Soccer Data](https://app.datacamp.com/workspace/datasets/dataset-python-soccer), welcher Daten aus der höchsten englischen Fussballdivision beinhaltet, folgende Fragestellung / Hypothese zu beantworten:


### Die Mannschaft, die zur Halbzeit vorne liegt, gewinnt mit einer Chance von mindestens 75% das Spiel. Falls es zur Halbzeit unentschieden steht, gewinnt eher das Heimteam mit einer Chance von mindestens 33.4%.


Als Einführung werden wir auf Datacamp folgende Kurse durchgehen:

- [Interactive Data Visualization with plotly](https://app.datacamp.com/learn/courses/interactive-data-visualization-with-plotly-in-r)

- [Intermediate Interactive Data Visualization with plotly](https://app.datacamp.com/learn/courses/intermediate-interactive-data-visualization-with-plotly-in-r)


```{r}
# Import libraries
library("plotly")
library("plyr")
library("dplyr")
library("forcats")
library("RColorBrewer")
```

### Daten einlesen und Dataframe erstellen

```{r}
# Collect only the CSV files in folder "data"; without a pattern every file in
# that folder (README, .DS_Store, ...) would be handed to read.csv().
files <- list.files(
  path = "./data/",
  pattern = "\\.csv$",
  ignore.case = TRUE,
  full.names = TRUE
)

# Fail early with a readable message instead of building an empty data frame
# that breaks every later chunk.
if (length(files) == 0) {
  stop("No CSV files found in ./data/", call. = FALSE)
}

# Create one data frame from all season CSVs (years 2015-2019) by reading each
# file and row-binding the results
df <- ldply(.data = files, .fun = read.csv)

# View() needs an interactive session; skip it when the notebook is rendered
# non-interactively
if (interactive()) {
  View(df)
}
```

Hier zählen wir, wie oft das Heim- und das Auswärtsteam zur Halb- und Vollzeit gewinnen und wie oft das Spiel unentschieden steht.

- A = Auswärtsteam gewinnt

- D = Unentschieden

- H = Heimteam gewinnt

```{r}
# Frequency table of the halftime result codes (column HTR)
df_htr <- count(df, HTR)
# Frequency table of the fulltime result codes (column FTR)
df_ftr <- count(df, FTR)

# Print the halftime counts ...
df_htr
# ... followed by the fulltime counts
df_ftr
```

```{r}
# Human-readable labels for the result codes found in HTR / FTR
result_labels <- c(A = "Away win", D = "Draw", H = "Home win")

# Look the counts up by result code instead of relying on the row order (and
# row count) of df_htr / df_ftr: a missing code or an extra blank/NA row would
# otherwise attach the wrong label to a count or make data.frame() fail.
codes <- names(result_labels)
halftime_n <- df_htr$n[match(codes, df_htr$HTR)]
fulltime_n <- df_ftr$n[match(codes, df_ftr$FTR)]

# One row per result with its halftime and fulltime frequency; a code that
# never occurs is reported as 0
df_results <- data.frame(
  Result = unname(result_labels),
  Halftime = ifelse(is.na(halftime_n), 0L, halftime_n),
  Fulltime = ifelse(is.na(fulltime_n), 0L, fulltime_n)
)

df_results
```
```{r}
# Grouped bar chart comparing how often each result occurs at halftime and at
# fulltime. The figure size is passed to plot_ly() because specifying
# width/height in layout() is deprecated and raises a warning.
fig <- plot_ly(
  df_results,
  x = ~Result, y = ~Halftime,
  type = "bar", name = "Halftime Score",
  width = 600, height = 500
) %>%
  add_trace(y = ~Fulltime, name = "Fulltime Score") %>%
  layout(
    yaxis = list(title = "Amount"),
    barmode = "group"
  )

fig
```


```{r}
# Combine the halftime and fulltime result codes into one "game progress"
# label, separated by a space.
# Example: "H H" = home team is winning at halftime and also wins at fulltime
df[["result"]] <- paste(df$HTR, df$FTR, sep = " ")

# Show the new column, kept as a one-column data frame
df["result"]
```

```{r}
# Count how often each halftime/fulltime combination ("game progress") occurs
df_count_results <- df %>%
  count(result)

# Bar chart of the counts, most frequent game progress first. The figure size
# is passed to plot_ly() because specifying width/height in layout() is
# deprecated and raises a warning.
df_count_results %>%
  mutate(result = fct_reorder(result, n, .desc = TRUE)) %>%
  plot_ly(
    x = ~result, y = ~n,
    text = ~n, textposition = "auto",
    width = 800, height = 500
  ) %>%
  add_bars() %>%
  layout(
    xaxis = list(title = "Game Progress"),
    yaxis = list(title = "Amount of Game Progresses"),
    title = "How are the different game progresses distributed?"
  )
```

Hier wollen wir herausfinden, wie wahrscheinlich die 9 möglichen Spielverläufe sind, bevor das Spiel überhaupt beginnt.

```{r}
# Share of each halftime/fulltime combination among all games, in percent
# (rounded to two decimals)
df_count_results_prob <- df %>%
  group_by(result) %>%
  summarise(count_result = round(n() / nrow(df) * 100, digits = 2))

# Bar chart of the percentages, most likely game progress first. The figure
# size is passed to plot_ly() because specifying width/height in layout() is
# deprecated and raises a warning.
df_count_results_prob %>%
  mutate(result = fct_reorder(result, count_result, .desc = TRUE)) %>%
  plot_ly(
    x = ~result, y = ~count_result,
    text = ~count_result, textposition = "auto",
    width = 800, height = 500
  ) %>%
  add_bars() %>%
  layout(
    xaxis = list(title = "Game Progress"),
    yaxis = list(title = "Probability of Game Progress (%)"),
    title = "How are the different game progresses distributed?"
  )
```

```{r}
# Percentage of all games per halftime/fulltime combination, rounded to two
# decimals. Note that this reassigns df_count_results, which now carries a
# count_result column.
df_count_results <- summarise(
  group_by(df, result),
  count_result = round(n() / nrow(df) * 100, digits = 2)
)

# Donut chart (pie with a hole) of those percentages
plot_ly(df_count_results, labels = ~result, values = ~count_result) %>%
  add_pie(hole = 0.4, color = I("white")) %>%
  layout(
    xaxis = list(title = "Game Progress"),
    yaxis = list(title = "Probability %"),
    title = "What is the probability of each game progress?"
  )
```
```{r}
# Percentage of rows in `df2` relative to the rows in `df1`; used for the
# halftime -> fulltime away / draw / home probabilities.
#
# df1: data frame whose row count is the denominator (e.g. all games with a
#      given halftime result)
# df2: data frame whose row count is the numerator (usually a subset of df1)
# Returns the percentage rounded to two decimals, or NA_real_ when `df1` has
# no rows (the plain division silently produced NaN or Inf in that case).
calc_prob <- function(df1, df2) {
  total <- nrow(df1)
  if (isTRUE(total == 0)) {
    return(NA_real_)
  }
  # Same operation order as before (100 / total * part), so results for
  # non-empty input are unchanged
  round(100 / total * nrow(df2), digits = 2)
}
```

```{r}
# Games in which the home team is ahead at halftime
df_ht_home <- filter(df, HTR == "H")

# Of those, the games the home team also wins at fulltime
df_ft_home <- filter(df_ht_home, FTR == "H")

# Percentage of halftime home leads that end in a home win
home_win_prob <- calc_prob(df_ht_home, df_ft_home)

cat("Probability that the home team wins the game if they are leading at half time: ", home_win_prob, "%")
```
```{r}
# Games in which the away team is ahead at halftime
df_ht_away <- filter(df, HTR == "A")

# Of those, the games the away team also wins at fulltime
df_ft_away <- filter(df_ht_away, FTR == "A")

# Percentage of halftime away leads that end in an away win
away_win_prob <- calc_prob(df_ht_away, df_ft_away)

cat("Probability that the away team wins the game if they are leading at half time: ", away_win_prob, "%")
```

```{r}
# Games that are level at halftime
df_ht_draw <- filter(df, HTR == "D")

# Of those, the games that are still level at fulltime
df_ft_draw <- filter(df_ht_draw, FTR == "D")

# Percentage of halftime draws that also end in a draw
draw_prob <- calc_prob(df_ht_draw, df_ft_draw)

cat("Probability that the game ends in a draw if the halftime result is also a draw: ", draw_prob, "%")
```

```{r}
# Games that are level at halftime and end with a home win
df_ht_draw_ft_home_win <- filter(df_ht_draw, FTR == "H")

# Percentage of halftime draws that the home team goes on to win
home_win_after_ht_draw_prob <- calc_prob(df_ht_draw, df_ht_draw_ft_home_win)

cat("Probability that the home team wins if the halftime result is a draw: ", home_win_after_ht_draw_prob, "%")
```
### Bestätigung der Hypothese

Somit können wir aus den 2 Wahrscheinlichkeiten "home_win_prob" und "away_win_prob" unsere Hypothese wie folgt bestätigen:

```{r}
# Probability that the team leading at halftime (home or away) wins the game.
# Pool the raw counts: games won by the halftime leader divided by all games
# that had a halftime leader. The earlier weighted mean of home_win_prob and
# away_win_prob used the number of *wins* (df_ft_*) as weights instead of the
# number of halftime leads (df_ht_*); that over-weights the side with the
# higher conversion rate and can only overstate the combined probability.
ht_lead_games <- nrow(df_ht_home) + nrow(df_ht_away)
ht_lead_wins <- nrow(df_ft_home) + nrow(df_ft_away)
ht_ft_win_prob <- round(100 * ht_lead_wins / ht_lead_games, digits = 2)

cat("Probability that the team leading at half time wins the entire game: ", ht_ft_win_prob, "%")
```

```{r}
# Horizontal bar chart of three of the conditional probabilities computed
# above: home lead -> home win, away lead -> away win, halftime draw -> draw
fig <- plot_ly(
  x = c(home_win_prob, away_win_prob, draw_prob),
  y = c("Home wins after leading at HT", "Away wins after leading at HT", "Draw at FT & HT"),
  type = "bar"
) %>%
  layout(
    title = "Game Progress Overview",
    xaxis = list(title = "Probability")
  )

fig

```
### Fragestellung 2:
### Das Heimteam schiesst mehr aufs Tor als das Auswärtsteam, aber welches Team ist effizienter?

```{r}
# 2D histogram: fulltime home goals (x) against home shots (y)
p1 <- add_histogram2d(
  plot_ly(df, x = ~FTHG, y = ~HS, coloraxis = "coloraxis"),
  nbinsy = 40
)

# 2D histogram: fulltime away goals (x) against away shots (y)
p2 <- add_histogram2d(
  plot_ly(df, x = ~FTAG, y = ~AS, coloraxis = "coloraxis"),
  nbinsy = 40
)

# Place both histograms side by side, each with its own axes, and title them
layout(
  subplot(p1, p2, nrows = 1, shareX = FALSE, shareY = FALSE),
  title = "Goals vs Shots Overview",
  xaxis = list(title = "Home Goals"),
  xaxis2 = list(title = "Away Goals"),
  yaxis = list(title = "Home Shots"),
  yaxis2 = list(title = "Away Shots"),
  coloraxis = list(colorscale = "Jet")
)
```
Anhand des Plots oben ist gut zu sehen, dass das Heimteam eher 1-2 Tore schiesst und 9-16 Torschüsse aufweist. Beim Auswärtsteam sieht es etwas anders aus: Es schiesst eher 0-1 Tore und weist 6-11 Torschüsse auf.

```{r}
# Linear regression of fulltime away goals (FTAG) on away shots (AS).
# na.exclude makes fitted() return one value per row of df (NA where FTAG or
# AS is missing). With the default NA handling, incomplete rows are dropped
# and the fitted vector no longer lines up with the x values used below.
m <- lm(FTAG ~ AS, data = df, na.action = na.exclude)

# Scatterplot of the observations with the fitted regression line on top
df %>%
  plot_ly(x = ~AS, y = ~FTAG) %>%
  add_markers(showlegend = FALSE) %>%
  add_lines(y = ~fitted(m))
```
```{r}
# 2D histogram of home shots (x) against away shots (y) per game
shots_hist <- add_histogram2d(
  plot_ly(df, x = ~HS, y = ~AS, coloraxis = "coloraxis"),
  nbinsx = 70, nbinsy = 60
)
layout(shots_hist, coloraxis = list(colorscale = "Jet"))
```

```{r}
# Scoring efficiency over all games: total goals divided by total shots
# (HS / AS) and by total shots on target (HST / AST), separately for the home
# and the away side. Each value is a ratio rounded to three decimals.
# The sums are taken without na.rm, so a single NA in any of these columns
# makes the affected ratio NA.
df_efficiency <- df %>%
  summarise(
    "Home Goals per Shot" = round(sum(FTHG) / sum(HS),digits = 3),
    "Home Goals per Shot on Target" = round(sum(FTHG) / sum(HST),digits = 3),
    "Away Goals per Shot" = round(sum(FTAG) / sum(AS),digits = 3),
    "Away Goals per Shot on Target" = round(sum(FTAG) / sum(AST),digits = 3)
)
# Transpose the one-row summary so each metric becomes a row.
# NOTE(review): the column is named "Percent" but holds the ratios computed
# above (goals per shot), not values multiplied by 100.
t_df_efficiency <- data.frame("Percent" = t(df_efficiency))
```

```{r}
# Horizontal bar chart of the four efficiency ratios, one bar per metric
fig <- plot_ly(
  x = t_df_efficiency$Percent,
  y = c("Home Goals per Shot", "Home Goals per Shot on Target", "Away Goals per Shot", "Away Goals per Shot on Target"),
  type = "bar"
) %>%
  layout(
    title = "Team Efficiency",
    xaxis = list(title = "Probability")
  )
fig
```


